
########################
# 5. REATTACH DATA
########################

library(fastLink)
library(data.table)
library(here)

# Declare where this script sits relative to the project root used by here().
i_am("5_reattach_data.R")
# NOTE(review): this wipes every object in the global environment; anything
# created before this line is gone from here on.
rm(list=ls())

# set primary directory and folders
destination <- here()
# here() returns the project root WITHOUT a trailing separator, so pasting the
# folder name straight onto it produced "<root>temporary_files/". Build the path
# with file.path() and then append the trailing "/" that the rest of the script
# relies on when it calls paste0(temp.folder, <file name>).
temp.folder <- paste0(file.path(destination, 'temporary_files'), '/')

# set working directory (the relative './output/...' paths below depend on it)
setwd(destination)

## aggregate files: starting with main
# Chunk-level results of the main match are listed from, and read from, the
# temporary folder. (Previously the files were listed in temp.folder but loaded
# from './output/', so a file found in one place was opened in another.)
main.dir <- temp.folder
main.files <- list.files(main.dir)

# This script writes its own "*_dat_agg.RData" files into the same folder and
# their names also contain "_matched"/"_unmatched"; exclude them so a re-run
# does not read its own aggregated output as if it were a chunk file.
main.files <- main.files[!grepl("_dat_agg", main.files)]

# aggregate matches
matched.dat.files <- main.files[grepl("_matched", main.files)]
unmatched.dat.files <- main.files[grepl("_unmatched", main.files)]

# Each matched file is expected to restore an object named `matched.subset`.
matched.dat <- list()
for(i in matched.dat.files){
  load(paste0(main.dir, i))
  matched.dat[[i]] <- matched.subset
  cat(i,"\n")
}
matched.dat <- rbindlist(matched.dat, fill = TRUE)

# Each unmatched file is expected to restore an object named `unmatched.subset`.
unmatched.dat <- list()
for(i in unmatched.dat.files){
  load(paste0(main.dir, i))
  unmatched.dat[[i]] <- unmatched.subset
  cat(i,"\n")
}
unmatched.dat <- rbindlist(unmatched.dat, fill = TRUE)


# aggregate summary object
# Each EM file is expected to restore an object named `em.object`.
em.files <- main.files[grepl("_EM_object", main.files)]
match.summary <- list()
for(i in em.files){
  load(paste0(main.dir, i))
  match.summary[[i]] <- em.object
  cat(i,"\n")
}
match.summary <- summary(aggregateEM(em.list=match.summary)) # summarize match info

## save files (aggregated output)
# The temporary aggregates carry the "main_" prefix because that is the name the
# reattachment step later in this script loads ('main_matched_dat_agg.RData' and
# 'main_unmatched_dat_agg.RData'); it also mirrors the "small_" prefix used for
# the small-cluster match.
save(matched.dat, file= paste0(temp.folder,'main_matched_dat_agg.RData')) # temporary
save(unmatched.dat, file= paste0(temp.folder,'main_unmatched_dat_agg.RData')) # temporary
save(match.summary, file='./output/match_summary.RData')



## aggregate files: moving on to small
# Chunk-level results of the small-cluster match are listed from, and read from,
# the 'small_match/' subfolder of the temporary folder. (Previously the files
# were listed there but loaded from './output/small_match/', so a file found in
# one place was opened in another.)
small.dir <- paste0(temp.folder,'small_match/')
small.files <- list.files(small.dir)

# aggregate matches
matched.dat.files <- small.files[grepl("_matched", small.files)]
unmatched.dat.files <- small.files[grepl("_unmatched", small.files)]

# Each matched file is expected to restore an object named `matched.subset`.
matched.dat <- list()
for(i in matched.dat.files){
  load(paste0(small.dir, i))
  matched.dat[[i]] <- matched.subset
  cat(i,"\n")
}
matched.dat <- rbindlist(matched.dat, fill = TRUE)

# Each unmatched file is expected to restore an object named `unmatched.subset`.
unmatched.dat <- list()
for(i in unmatched.dat.files){
  load(paste0(small.dir, i))
  unmatched.dat[[i]] <- unmatched.subset
  cat(i,"\n")
}
unmatched.dat <- rbindlist(unmatched.dat, fill = TRUE)


# aggregate summary object
# Each EM file is expected to restore an object named `em.object`.
em.files <- small.files[grepl("_EM_object", small.files)]
match.summary <- list()
for(i in em.files){
  load(paste0(small.dir, i))
  match.summary[[i]] <- em.object
  cat(i,"\n")
}
match.summary <- summary(aggregateEM(em.list=match.summary)) # summarize match info

## save files (aggregated output)
save(matched.dat, file= paste0(temp.folder,'small_matched_dat_agg.RData')) # temporary
save(unmatched.dat, file= paste0(temp.folder,'small_unmatched_dat_agg.RData')) # temporary
save(match.summary, file='./output/small_match_summary.RData')



#### Next, reattach the voter file and MTO data that we requested to be returned.
# Auxiliary voter data is read from separate files whose paths are built from
# gender, birth year, birth month, and state; records are then matched on each
# individual's unique voter ID.

### read temporary matched and unmatched data (main match)
# NOTE(review): these file names must be exactly the names under which the
# aggregated main-match data was saved to the temporary folder.
for (agg.file in c('main_matched_dat_agg.RData', 'main_unmatched_dat_agg.RData')) {
  load(paste0(temp.folder, agg.file))
}

### get auxiliary voter + turnout data by unique grouping of gender, year, and month
unique.sets <- unique(matched.dat[,c('gender','birthyear','birthmonth')])
unique.sets$gender.cat <- ifelse(unique.sets$gender=='f','female',
                                 ifelse(unique.sets$gender=='m','male',
                                        'unknown'))

### now will loop over each unique set
### this requires a multi-level loop because registration and birthdate will vary for each unique voter
#### and number of elections vary by state
turnout.dat <- vector("list", nrow(unique.sets))
for(i in seq_len(nrow(unique.sets))){   # seq_len() is safe when there are zero sets
  this.one <- unique.sets[i,]

  # identify each observation in that unique set from the matched dataset
  obs <- matched.dat[gender==this.one$gender &
                       birthyear==this.one$birthyear &
                       birthmonth==this.one$birthmonth]

  # use the state of the voter file each record came from (file_state) to locate
  # individual turnout data, because absentee voters may not be in the
  # "current_state" voter file
  states.incl <- obs$file_state

  cat(i, "of", nrow(unique.sets),"unique birthdates\n")

  # one element per state; bound together once after the state loop
  state.results <- list()

  # iterate over each state so we only open that state voter file
  for(j in unique(states.incl)){

    state.obs <- obs[file_state==j]
    # load() is expected to restore an object named `aux.dat`
    load(paste0(paste('./final-r-files-auxiliary-data',this.one$gender.cat,
                      this.one$birthyear,this.one$birthmonth,j,sep='/'), '.RData'))

    aux.dat <- aux.dat[l2id%in%state.obs$l2id]

    # every general/primary election column (all of these are dropped from the output)
    election.vars <- names(aux.dat)[grepl(paste(c('general','primary'), collapse='|'), names(aux.dat))]

    # Election columns that carry a date in their name. The dates are parsed from
    # exactly these columns, so dated.vars[k] and possible.elections[k] always
    # describe the same election. (Indexing the unfiltered election.vars with
    # positions taken from the filtered date vector picked the wrong columns
    # whenever a "consolidated" column was present.)
    dated.vars <- election.vars[!grepl('consolidated',election.vars)]
    possible.elections <- as.Date(gsub("_","-",gsub("general_|primary_|presidential_primary_","",
                                                    dated.vars)))

    # voter-file columns to carry over: everything except the individual
    # elections and keys that are already present in the matched data
    info.cols <- names(aux.dat)[!names(aux.dat) %in% c(election.vars,"current_state","file_state","l2id")]

    # One element per voter. Previously each voter overwrote the same object, so
    # only the last voter in every state made it into the output.
    state.ids <- unique(state.obs$l2id)
    voter.rows <- vector("list", length(state.ids))
    k <- 0L

    # calculate turnout vars
    for(n in state.ids){
      k <- k + 1L
      # for each individual, identify elections that are:
      #(1) after each individual turns 18 and is eligible to vote
      #(2) before/after random assignment
      #(3) accounting for (or not) each voter's calculated registration date

      voter.obs <- state.obs[l2id==n]
      voter.aux <- aux.dat[l2id==n]

      # 18*365 days approximates the 18th birthday (leap days are ignored)
      adult.date <- as.Date(aux.dat[l2id==n,voters_birthdate]) + 18*365
      reg.date <- as.Date(aux.dat[l2id==n,voters_calculatedregdate])
      # random assignment is dated to the first of the month; when a voter has
      # several matched rows, the first row's month/year is used
      ra.date <- as.Date(paste('01', voter.obs[,mto.ra.month][1],
                               voter.obs[,mto.ra.year][1]), format='%d %m %Y')

      # which() drops comparisons that are NA (e.g. a missing registration date)
      eligible <- possible.elections > adult.date
      pre.vars <- dated.vars[which(eligible & possible.elections < ra.date)]
      post.vars <- dated.vars[which(eligible & possible.elections > ra.date)]
      postreg.vars <- dated.vars[which(eligible &
                                         possible.elections > reg.date &
                                         possible.elections > ra.date)]

      # total: "pre|post|post-registration" counts of eligible elections
      total.denominators <- paste(length(pre.vars), length(post.vars), length(postreg.vars), sep="|")

      # share of eligible elections with a recorded 'Y'; NaN when the
      # corresponding denominator is zero
      pretreat.turnout <- sum(voter.aux[, pre.vars, with=FALSE]=='Y', na.rm=TRUE)/length(pre.vars)
      posttreat.turnout <- sum(voter.aux[, post.vars, with=FALSE]=='Y', na.rm=TRUE)/length(post.vars)
      postreg.turnout <- sum(voter.aux[, postreg.vars, with=FALSE]=='Y', na.rm=TRUE)/length(postreg.vars)

      # keep only the aggregate turnout vars and voter information, drop individual elections
      voter.rows[[k]] <- data.table(voter.obs, voter.aux[, info.cols, with=FALSE],
                                    pretreat.turnout, posttreat.turnout, postreg.turnout,total.denominators)

    }
    # bind all voters of this state together
    state.results[[length(state.results) + 1L]] <- rbindlist(voter.rows)
    cat(j, "...")
  }
  cat("done\n")
  # bind all states of this unique set together
  full.dat <- rbindlist(state.results)
  turnout.dat[[i]] <- full.dat
}

### aggregate the per-set results into one table
matched.final <- rbindlist(turnout.dat)

### create random id variable for each individual
#### this allows us to identify non-unique matches later, once identifying
#### information is removed, without being able to trace the id back
main.voter.ids <- unique(matched.final$l2id)
n.main.voters <- length(main.voter.ids)
id.vars <- data.frame(l2id = main.voter.ids,
                      l2.id = sample(1:n.main.voters, n.main.voters, replace = FALSE))

matched.final <- merge(matched.final, id.vars, by = "l2id", all.x = TRUE)

### merge in contemporary census data (averaged)
tract.dat <- read.csv('tract_level_data.csv', stringsAsFactors = FALSE)
tract.keys <- c('census_tract', 'county_fips', 'current_state')
matched.final <- data.table(merge(data.frame(matched.final), tract.dat,
                                  by = tract.keys, all.x = TRUE))

### return only requested variables and drop individual- and tract-level identifying information, etc. (be sure to include original ID var)
drop.these <- c('mto.first', 'mto.last', 'mto.middle', 'mto.suffix',
                'mto.birthmonth', 'mto.birthday', 'mto.birthyear', 'mto.gender',
                'l2id', 'first', 'middle', 'last', 'suffix',
                'birthyear', 'birthmonth', 'birthday', "voters_birthdate",
                'census_tract', 'county_fips')

# logical masks over the column names: TRUE for every column that is kept
keep.matched <- !(names(matched.final) %in% drop.these)
keep.unmatched <- !(names(unmatched.dat) %in% drop.these)

matched.final <- matched.final[, keep.matched, with = FALSE]
unmatched.final <- unmatched.dat[, keep.unmatched, with = FALSE]

names(matched.final) ### check that all identifiers are removed
names(unmatched.final) ### check this one too


### Files for matched dataset with propensity scores and linked data, and unmatched MTO participants
### Output is anonymized
save(matched.final, file = "./output/main_matched_data_final.RData")
save(unmatched.final, file = "./output/main_unmatched_data_final.RData")
write.csv(matched.final, './output/main_matched_data_final.csv', row.names = FALSE)
write.csv(unmatched.final, './output/main_unmatched_data_final.csv', row.names = FALSE)



### Repeat the reattachment for the small cluster match.
### Read its temporary matched and unmatched data; this replaces the
### `matched.dat` / `unmatched.dat` objects currently in the workspace.
for (agg.file in c('small_matched_dat_agg.RData', 'small_unmatched_dat_agg.RData')) {
  load(paste0(temp.folder, agg.file))
}

### get auxiliary voter + turnout data by unique grouping of gender, year, and month
unique.sets <- unique(matched.dat[,c('gender','birthyear','birthmonth')])
unique.sets$gender.cat <- ifelse(unique.sets$gender=='f','female',
                                 ifelse(unique.sets$gender=='m','male',
                                        'unknown'))

### now will loop over each unique set
### this requires a multi-level loop because registration and birthdate will vary for each unique voter
#### and number of elections vary by state
turnout.dat <- vector("list", nrow(unique.sets))
for(i in seq_len(nrow(unique.sets))){   # seq_len() is safe when there are zero sets
  this.one <- unique.sets[i,]

  # identify each observation in that unique set from the matched dataset
  obs <- matched.dat[gender==this.one$gender &
                       birthyear==this.one$birthyear &
                       birthmonth==this.one$birthmonth]

  # use the state of the voter file each record came from (file_state) to locate
  # individual turnout data, because absentee voters may not be in the
  # "current_state" voter file
  states.incl <- obs$file_state

  cat(i, "of", nrow(unique.sets),"unique birthdates\n")

  # one element per state; bound together once after the state loop
  state.results <- list()

  # iterate over each state so we only open that state voter file
  for(j in unique(states.incl)){

    state.obs <- obs[file_state==j]
    # load() is expected to restore an object named `aux.dat`
    load(paste0(paste('./final-r-files-auxiliary-data',this.one$gender.cat,
                      this.one$birthyear,this.one$birthmonth,j,sep='/'), '.RData'))

    aux.dat <- aux.dat[l2id%in%state.obs$l2id]

    # every general/primary election column (all of these are dropped from the output)
    election.vars <- names(aux.dat)[grepl(paste(c('general','primary'), collapse='|'), names(aux.dat))]

    # Election columns that carry a date in their name. The dates are parsed from
    # exactly these columns, so dated.vars[k] and possible.elections[k] always
    # describe the same election. (Indexing the unfiltered election.vars with
    # positions taken from the filtered date vector picked the wrong columns
    # whenever a "consolidated" column was present.)
    dated.vars <- election.vars[!grepl('consolidated',election.vars)]
    possible.elections <- as.Date(gsub("_","-",gsub("general_|primary_|presidential_primary_","",
                                                    dated.vars)))

    # voter-file columns to carry over: everything except the individual
    # elections and keys that are already present in the matched data
    info.cols <- names(aux.dat)[!names(aux.dat) %in% c(election.vars,"current_state","file_state","l2id")]

    # One element per voter. Previously each voter overwrote the same object, so
    # only the last voter in every state made it into the output.
    state.ids <- unique(state.obs$l2id)
    voter.rows <- vector("list", length(state.ids))
    k <- 0L

    # calculate turnout vars
    for(n in state.ids){
      k <- k + 1L
      # for each individual, identify elections that are:
      #(1) after each individual turns 18 and is eligible to vote
      #(2) before/after random assignment
      #(3) accounting for (or not) each voter's calculated registration date

      voter.obs <- state.obs[l2id==n]
      voter.aux <- aux.dat[l2id==n]

      # 18*365 days approximates the 18th birthday (leap days are ignored)
      adult.date <- as.Date(aux.dat[l2id==n,voters_birthdate]) + 18*365
      reg.date <- as.Date(aux.dat[l2id==n,voters_calculatedregdate])
      # random assignment is dated to the first of the month; when a voter has
      # several matched rows, the first row's month/year is used
      ra.date <- as.Date(paste('01', voter.obs[,mto.ra.month][1],
                               voter.obs[,mto.ra.year][1]), format='%d %m %Y')

      # which() drops comparisons that are NA (e.g. a missing registration date)
      eligible <- possible.elections > adult.date
      pre.vars <- dated.vars[which(eligible & possible.elections < ra.date)]
      post.vars <- dated.vars[which(eligible & possible.elections > ra.date)]
      postreg.vars <- dated.vars[which(eligible &
                                         possible.elections > reg.date &
                                         possible.elections > ra.date)]

      # total: "pre|post|post-registration" counts of eligible elections
      total.denominators <- paste(length(pre.vars), length(post.vars), length(postreg.vars), sep="|")

      # share of eligible elections with a recorded 'Y'; NaN when the
      # corresponding denominator is zero
      pretreat.turnout <- sum(voter.aux[, pre.vars, with=FALSE]=='Y', na.rm=TRUE)/length(pre.vars)
      posttreat.turnout <- sum(voter.aux[, post.vars, with=FALSE]=='Y', na.rm=TRUE)/length(post.vars)
      postreg.turnout <- sum(voter.aux[, postreg.vars, with=FALSE]=='Y', na.rm=TRUE)/length(postreg.vars)

      # keep only the aggregate turnout vars and voter information, drop individual elections
      voter.rows[[k]] <- data.table(voter.obs, voter.aux[, info.cols, with=FALSE],
                                    pretreat.turnout, posttreat.turnout, postreg.turnout,total.denominators)

    }
    # bind all voters of this state together
    state.results[[length(state.results) + 1L]] <- rbindlist(voter.rows)
    cat(j, "...")
  }
  cat("done\n")
  # bind all states of this unique set together
  full.dat <- rbindlist(state.results)
  turnout.dat[[i]] <- full.dat
}

### aggregate the per-set results into one table
matched.final <- rbindlist(turnout.dat)

### create random id variable for each individual
#### this allows us to identify non-unique matches later, once identifying
#### information is removed, without being able to trace the id back
small.voter.ids <- unique(matched.final$l2id)
n.small.voters <- length(small.voter.ids)
id.vars <- data.frame(l2id = small.voter.ids,
                      l2.id = sample(1:n.small.voters, n.small.voters, replace = FALSE))

matched.final <- merge(matched.final, id.vars, by = "l2id", all.x = TRUE)

### merge in contemporary census data (averaged)
tract.dat <- read.csv('tract_level_data.csv', stringsAsFactors = FALSE)
tract.keys <- c('census_tract', 'county_fips', 'current_state')
matched.final <- data.table(merge(data.frame(matched.final), tract.dat,
                                  by = tract.keys, all.x = TRUE))

### return only requested variables and drop individual- and tract-level identifying information, etc. (be sure to include original ID var)
drop.these <- c('mto.first', 'mto.last', 'mto.middle', 'mto.suffix',
                'mto.birthmonth', 'mto.birthday', 'mto.birthyear', 'mto.gender',
                'l2id', 'first', 'middle', 'last', 'suffix',
                'birthyear', 'birthmonth', 'birthday', "voters_birthdate",
                'census_tract', 'county_fips')

# logical masks over the column names: TRUE for every column that is kept
keep.matched <- !(names(matched.final) %in% drop.these)
keep.unmatched <- !(names(unmatched.dat) %in% drop.these)

matched.final <- matched.final[, keep.matched, with = FALSE]
unmatched.final <- unmatched.dat[, keep.unmatched, with = FALSE]

names(matched.final) ### check that all identifiers are removed
names(unmatched.final) ### check this one too


### Files for matched dataset with propensity scores and linked data, and unmatched MTO participants
### Output is anonymized
save(matched.final, file = "./output/small_matched_data_final.RData")
save(unmatched.final, file = "./output/small_unmatched_data_final.RData")
write.csv(matched.final, './output/small_matched_data_final.csv', row.names = FALSE)
write.csv(unmatched.final, './output/small_unmatched_data_final.csv', row.names = FALSE)


